Exploratory data analysis

# Count the number of majors in each category, largest category first.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
majors_processed %>%
  count(Major_category, sort = TRUE)
## # A tibble: 16 x 2
##    Major_category                          n
##    <chr>                               <int>
##  1 Engineering                            29
##  2 Education                              16
##  3 Humanities & Liberal Arts              15
##  4 Biology & Life Science                 14
##  5 Business                               13
##  6 Health                                 12
##  7 Computers & Mathematics                11
##  8 Agriculture & Natural Resources        10
##  9 Physical Sciences                      10
## 10 Psychology & Social Work                9
## 11 Social Science                          9
## 12 Arts                                    8
## 13 Industrial Arts & Consumer Services     7
## 14 Law & Public Policy                     5
## 15 Communications & Journalism             4
## 16 Interdisciplinary                       1
# Aggregate to one row per major category: head counts by gender, the
# sample-size-weighted average of the per-major median salary, and the
# share of women. Categories are sorted from the highest to the lowest
# share of women.
by_major_category <- majors_processed %>%
  # Rows without a Total are dropped before aggregating.
  filter(!is.na(Total)) %>%
  group_by(Major_category) %>%
  summarize(
    Men = sum(Men),
    Women = sum(Women),
    Total = sum(Total),
    MedianSalary = sum(Median * Sample_size) / sum(Sample_size)
  ) %>%
  mutate(ShareWomen = Women / Total) %>%
  arrange(desc(ShareWomen))
# Histogram of median earnings across majors, using the default binning.
ggplot(majors_processed, aes(x = Median)) +
  geom_histogram(fill = "dodgerblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Metallurgical gender mix

library(ggrepel)
# Box plot of the share of women per major within each major category,
# with categories ordered by their median share of women.
majors_processed %>%
  # Drop majors with no recorded share of women so that a missing value
  # cannot distort the category ordering computed by fct_reorder().
  filter(!is.na(ShareWomen)) %>%
  mutate(Major_category = fct_reorder(Major_category, ShareWomen)) %>%
  ggplot(aes(Major_category, ShareWomen, fill = Major_category)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Metallurgical mix") +
  theme_bw() +
  # Dashed reference line at a 15.3% share of women, with a text label.
  geom_hline(yintercept = 0.153, lty = 2) +
  annotate("text", label = "15.3% female", x = 9, y = 0.1, angle = 90) +
  xlab("")

Comparing income distributions across major category

# Box plot comparing the distribution of median earnings across major
# categories, ordered by median earnings and drawn on a log10 dollar scale.
majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(Major_category, Median, fill = Major_category)) +
  # FALSE is spelled out because F is an ordinary, reassignable variable.
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  xlab("") +
  scale_y_log10(labels = scales::dollar_format())

Median earnings across major categories

# Bar chart of the median of the per-major median earnings within each
# major category, with bars sorted by that value.
majors_processed %>%
  group_by(Major_category) %>%
  summarise(Median = median(Median)) %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(x = Major_category, y = Median, fill = Major_category)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

What are the highest earning majors?

# Interquartile range of the highest earning majors: the 20 majors with
# the largest median earnings, with a point at the median and an error bar
# running from P25th to P75th.
majors_processed %>%
  arrange(desc(Median)) %>%
  select(Major, Major_category, Median, P25th, P75th) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(x = Major, y = Median, colour = Major_category)) +
  geom_point() +
  coord_flip() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  # Anchor the earnings axis at zero.
  expand_limits(y = 0)

Business subjects - Highest earning majors

This is of particular interest given my background in finance.

# Bar chart of median earnings for every major in the Business category,
# sorted by median earnings.
majors_processed %>%
  filter(Major_category == "Business") %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(x = Major, y = Median, fill = Major)) +
  geom_col(show.legend = FALSE) +
  xlab("") +
  coord_flip() +
  ggtitle("Highest earning majors in business")

Male dominated majors

library(ggrepel)
# The 20 majors with the lowest share of women, ordered by that share.
majors_processed %>%
  arrange(ShareWomen) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, ShareWomen)) %>%
  ggplot(aes(x = Major, y = ShareWomen)) +
  geom_point() +
  coord_flip()

Mapping share of women and median earnings per major

# Share of women against median earnings (log10 dollar scale) for majors
# with a sample size above 30, with a linear fit. Each point is labelled
# with its sample size.
majors_processed %>%
  filter(Sample_size > 30) %>%
  # Map ShareWomen directly: desc() is a sorting helper that negates its
  # input, which would mirror the x axis and flip the apparent trend.
  ggplot(aes(ShareWomen, Median)) +
  geom_point() +
  scale_y_log10(labels = scales::dollar_format()) +
  geom_smooth(method = "lm") +
  # alpha is a fixed setting, so it is passed outside aes(); inside aes()
  # it is treated as a mapped variable and adds a spurious legend.
  geom_text_repel(aes(label = Sample_size), alpha = 0.2) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# Unweighted linear regression of median earnings on the share of women.
# NOTE(review): this is fitted on recent_grads, whereas the surrounding
# code uses majors_processed - confirm that this is intended.
summary(lm(formula = Median ~ ShareWomen, data = recent_grads))
## 
## Call:
## lm(formula = Median ~ ShareWomen, data = recent_grads)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17261  -5474  -1007   3502  57604 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    56093       1705   32.90   <2e-16 ***
## ShareWomen    -30670       2987  -10.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9031 on 170 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3828, Adjusted R-squared:  0.3791 
## F-statistic: 105.4 on 1 and 170 DF,  p-value: < 2.2e-16

What are the most common Major Categories?

# Bar chart of the total number of graduates per major category (majors
# weighted by Total), sorted from the most to the least common category.
majors_processed %>%
  count(Major_category, wt = Total, sort = TRUE) %>%
  mutate(Major_category = fct_reorder(Major_category, n)) %>%
  ggplot(aes(Major_category, n, fill = Major_category)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most common major categories",
    x = "",
    y = "Total number of graduates"
  ) +
  # The argument is spelled out in full (labels) instead of relying on
  # partial argument matching.
  scale_y_continuous(labels = scales::comma_format()) +
  theme_bw() +
  theme(legend.position = "none")

What were the most common Majors?

# Bar chart of the 20 majors with the most graduates, sorted by Total.
majors_processed %>%
  # Take the top 20 first so the factor is reordered only over the rows
  # that are actually plotted.
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  ggplot(aes(Major, Total, fill = Major)) +
  geom_col() +
  theme(legend.position = "none") +
  # The argument is spelled out in full (labels) instead of relying on
  # partial argument matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()

How does gender mix relate to typical earnings?

# Stacked bar chart of the number of men and women in the 20 majors with
# the most graduates.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  # Reshape the Men and Women columns into Gender/Number pairs, using
  # pivot_longer() in place of the superseded gather().
  pivot_longer(
    cols = c(Men, Women),
    names_to = "Gender",
    values_to = "Number"
  ) %>%
  ggplot(aes(Major, Number, fill = Gender)) +
  geom_col() +
  # The argument is spelled out in full (labels) instead of relying on
  # partial argument matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()

# Category-level scatter of ShareWomen against MedianSalary, with a linear
# fit and repelled labels naming each major category.
by_major_category %>%
  ggplot(aes(x = ShareWomen, y = MedianSalary)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text_repel(aes(label = Major_category), force = 0.2) +
  # Anchor the salary axis at zero.
  expand_limits(y = 0) +
  labs(
    title = "Relationship between median salary and proportion of women",
    subtitle = "Ordered by major category"
  )
## `geom_smooth()` using formula 'y ~ x'

library(plotly)
# Interactive scatter of the share of women against median salary, one
# point per major. The seven most frequent major categories keep their own
# colour and the remaining ones are lumped together; point size follows
# the sample size.
g <- majors_processed %>%
  mutate(Major_category = fct_lump(Major_category, 7)) %>%
  ggplot(
    aes(ShareWomen, Median, colour = Major_category, size = Sample_size)
  ) +
  # NOTE(review): label is mapped on the points presumably so that
  # ggplotly() can show the major name in its tooltip - confirm.
  geom_point(aes(label = Major)) +
  # group = 1 fits a single line through all points rather than one line
  # per colour group.
  geom_smooth(aes(group = 1), method = "lm") +
  expand_limits(y = 0) +
  labs(
    title = "Relationship between median salary and proportion of women",
    subtitle = "Ordered by major and coloured by major category",
    x = "Percentage share of women in the field",
    y = "Median salary income"
  ) +
  theme(legend.position = "bottom") +
  #annotate("text",x=0.8,y=max(majors_processed$Median)-2000,hjust=1,vjust=1,label="Size refers to sample size")+
  #annotate("text",x=0.8,y=max(majors_processed$Median)-8000,hjust=1,vjust=1,label="Colour refers to major category")+
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_x_continuous(labels = scales::percent_format())

# Convert the static ggplot into an interactive plotly widget.
ggplotly(g)
## `geom_smooth()` using formula 'y ~ x'
# Linear regression of median earnings on the share of women, weighting
# each major by its sample size.
majors_processed %>%
  select(Major, Total, ShareWomen, Sample_size, Median) %>%
  lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size) %>%
  summary()
## 
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -260500  -61042  -13899   33262  865081 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    52073       1436  36.255   <2e-16 ***
## ShareWomen    -23650       2403  -9.842   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 123000 on 170 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.363,  Adjusted R-squared:  0.3592 
## F-statistic: 96.87 on 1 and 170 DF,  p-value: < 2.2e-16

Summary findings